#!/usr/bin/python

import sys,os,re,glob
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from pylab import *
import traceback
import resource 
resource.setrlimit(resource.RLIMIT_DATA,(2e9,2e9))

import ocrolib
from ocrolib import plotutils
from ocrolib import hocr
from ocrolib import RecognitionError
import ocropreproc
import ocrorast
import ocrofst

# Put pylab into interactive mode and turn off "hold", so that each plotting
# call in the debugging displays below replaces the previous figure contents.
ion()
hold(False)

def alert(*args):
    """Write a diagnostic line to stderr.

    Every argument is converted with str(), the pieces are joined with
    single spaces, and a trailing newline is appended.
    """
    message = " ".join(map(str,args))
    sys.stderr.write(message+"\n")

from optparse import OptionParser
# Command-line parser.  The usage text below is what parser.print_help() shows;
# it is printed when the script is invoked without any image arguments.
# "%prog" is expanded by optparse to the name of the running program.
parser = OptionParser(usage="""
%prog [options] image1 image2 ...

IMPORTANT: This is not the preferred way of running OCRopus.  Instead,
use the ocropus-{binarize,pseg,lattice,align,hocr} commands.

===

Recognize pages using built-in OCRopus components.  This first
uses the page cleaner, then the page segmenter, then the line
recognizers, and finally the language model.

The following components take files as arguments, and those files
are loaded in various ways.

--linerec -- line recognizer (.pymodel, .cmodel, or .model)
--langmod -- language model (OpenFST language model dump)

If you want to see what's going on, run ocropus-pages with the "-d" option
("-D" for continuous output, but this slows down recognition significantly).
With the "-L" option, you also see each text line as it's being recognized.

(For even more insight into what is going on during recognition, use the
ocropus-showpsegs and ocropus-showlrecs commands.)

Advanced Usage:

You can choose from a number of components for the different
processing stages.  See the output of "ocropus components" for your
choices.

Possible choices are:

--clean (ICleanupGray) -- binarization, denoising, deskewing
--pseg (ISegmentPage) -- page segmentation
--ticlass (ITextImageSegmentation) -- text/image segmentation (off by default)

For each component, you can pass additional parameters.  For example,
--clean StandardPreprocessing:rmbig_minaspect=0.1 uses an instance of
StandardPreprocessing for cleanup and sets its rmbig_minaspect
parameter to 0.1.  You can see a list of all the parameters with
"ocropus params StandardPreprocessing".

Instead of component names, you can also pass the names of
constructors of Python classes for each of those components, as in
"--clean my.CleanupPage:threshold=0.3" or "--clean
my.CleanupPage(0.3)".  This will import the "my" package and then call
the constructor.
""")

# Processing components; each value is a component or constructor spec.
parser.add_option("-C","--clean",
                  default="ocropreproc.CommonPreprocessing",
                  help="page cleaner")
parser.add_option("-P","--pseg",
                  default="ocrorast.SegmentPageByRAST",
                  help="line segmenter")
parser.add_option("-T","--ticlass",
                  default=None,
                  help="text image segmenter")

# Models loaded from files, plus the language model weight.
parser.add_option("-m","--linerec",
                  default="default.cmodel",
                  help="linerec model")
parser.add_option("-l","--langmod",
                  default="default.fst",
                  help="langmod")
parser.add_option("-w","--lweight",
                  type="float",
                  default=1.0,
                  help="weight for the language model")

# Output format and general behavior.
parser.add_option("-v","--verbose",
                  action="store_true",
                  help="verbose")
parser.add_option("-x","--hocr",
                  action="store_true",
                  help="output XHTML+hOCR")
parser.add_option("-p","--plain",
                  action="store_true",
                  help="output plain text")
parser.add_option("-r","--dpi",
                  type="int",
                  default=200,
                  help="resolution in dpi")
parser.add_option("-S","--silent",
                  action="store_true",
                  help="disable warnings")

# Interactive display switches used for debugging.
parser.add_option("-d","--display",
                  action="store_true",
                  help="display result")
parser.add_option("-D","--Display",
                  action="store_true",
                  help="display continuously")
parser.add_option("-L","--displaylines",
                  action="store_true",
                  help="display lines as well")

# Search parameter for the language model stage.
parser.add_option("-B","--beam",
                  type="int",
                  default=1000,
                  help="size of beam in beam search")

# Parse sys.argv: "options" holds the flag values, "args" the input image files.
options,args = parser.parse_args()

# Without any input images there is nothing to do: show the usage text and exit.
if not args:
    parser.print_help()
    sys.exit(0)

# FIXME add language model weights
# Reject unsupported weights through the option parser instead of an assert:
# asserts are stripped under "python -O", which would silently accept (and
# then ignore) a weight other than 1.0.  parser.error() prints the usage
# line plus this message to stderr and exits with status 2.
if options.lweight!=1.0:
    parser.error("other language model weights not implemented yet")

# -D (continuous display) implies -d (display).
if options.Display: options.display = True
if options.display: ion()
if options.displaylines:
    sys.stderr.write("[note] recognizer runs significantly slower with -L flag (line display)\n")

# Instantiate or load every recognition component before touching any page.
# Each option value may carry parameters, using the syntax documented under
# ocrolib.make_component.

# Page cleanup stage: noise removal, page deskewing, and other cleanup.
preproc = ocrolib.make_IBinarize(options.clean)

# Page segmentation stage.
segmenter = ocrolib.make_ISegmentPage(options.pseg)

# Line recognition stage.  Unlike the stages above, the recognizer is loaded
# from a model file rather than instantiated: x.model is a C++ line
# recognizer, x.cmodel a C++ character recognizer, and x.pymodel a pickled
# Python line recognizer.
linerec_path = ocrolib.ocropus_find_file(options.linerec)
linerec = ocrolib.load_linerec(linerec_path)
linerec.norejects = 1
alert("[note]","line recognizer:",linerec)

# Language model stage, loaded from disk unless disabled with "--langmod none".
lmodel = None
if options.langmod!="none":
    lmodel = ocrofst.OcroFST()
    langmod_path = ocrolib.ocropus_find_file(options.langmod)
    lmodel.load(langmod_path)

# Optional text/image segmentation stage; stays None when -T is not given.
ticlass = (None if options.ticlass is None
           else ocrolib.make_ITextImageClassification(options.ticlass))

# Now start the output with printing the hOCR header if hOCR output has been requested.
if options.hocr:
    print hocr.header()

# Iterate through the pages specified by the argument.  Since this can be somewhat tricky
# with TIFF files, we use the page_iterator abstraction that takes care of all the special
# cases.  But basically, this just gives us one gray image after another, plus the file name.
pageno = 0
for page_gray,pagefile in ocrolib.page_iterator(args):
    pageno += 1
    sys.stderr.write("[note] *** %d %s ***\n"%(pageno,pagefile))

    # Output geometric page information.
    # FIXME add: bbox, ppageno
    if options.hocr: 
        print "<div class='ocr_page' id='page_%d' ppageno='%d' image='%s'>"% \
            (pageno,pageno,pagefile)

    # Perform cleanup and binarization of the page.
    page_bin,page_gray = preproc.binarize(page_gray)

    if not options.silent:
        if ocrolib.quick_check_page_components(page_bin,dpi=options.dpi)<0.5:
            continue

    # Black out images in the binary page image.
    # This will cause images to be treated as non-text blocks
    # by the page segmenter.
    if ticlass is not None:
        ocrolib.blackout_images(page_bin,ticlass)

    # Perform page segmentation into text columns and text lines.
    page_seg = segmenter.segment(page_bin)

    # For debugging purposes, display the segmented page image.
    if options.display: 
        clf()
        axis = subplot(111)
        axis.imshow(page_bin,cmap=cm.gray)
        # plotutils.draw_pseg(page_seg,axis)
        draw()
        if not options.Display: 
            raw_input("hit ENTER to continue")
        else:
            ginput(1,timeout=0.1)

    # Now iterate through the text lines of the page, in reading order.
    # We use the RegionExtractor utility class for that.  The page_seg image
    # is just an RGB image, see http://docs.google.com/Doc?id=dfxcv4vc_92c8xxp7
    regions = ocrolib.RegionExtractor()
    regions.setPageLines(page_seg)

    # If there are too many text lines, probably something went wrong with the
    # page segmentation. (FIXME: make this more flexible)
    if regions.length()>150:
        alert("[error] too many lines (%d), probably bad input; skipping"%regions.length())
        continue

    alert("[note]",pagefile,"lines:",regions.length())
    for i in range(1,regions.length()):
        # Extract the line image and optionally display it.
        line = regions.extract(page_bin,i,1)
        if options.display and options.displaylines:
            clf(); subplot(111); imshow(line,cmap=cm.gray); ginput(1,timeout=0.1)

        # Perform some simple sanity checks on the text line image; skip anything
        # that doesn't look like a real text line.
        if not options.silent:
            if ocrolib.quick_check_line_components(line,dpi=options.dpi)<0.5:
                continue

        # Now perform the actual text line recognition.  The variables fst and rseg
        # hold the recognition lattice and raw segmentation that the line recognizer
        # computes.

        try:
            fst,rseg = linerec.recognizeLineSeg(line)
        except RecognitionError,e:
            print "FAILED:",re.sub('\n',' ',str(e)[:80])
        except:
            traceback.print_exc()
            alert("[error]","line recognizer failed")
            continue


        # Next is the application of the language model.  We perform a beam search
        # through the recognition lattice and language model.  The result is
        # stored in an iulib string.
        if lmodel is None:
            s = fst.bestpath() 
            if s is None:
                continue
            cost = 0.0
        else:
            fst.save("_pages.fst")
            # lmodel.save("_lmodel.fst")
            s,cost = ocrofst.beam_search_simple(fst,lmodel,options.beam)
            if cost>999999.0:
                alert("[warn]","beam search didn't find a solution (line not in language model)")
                s = fst.bestpath()
                cost = 999.0

        # Output the text line in various formats.
        if options.hocr:
            # For hOCR output, we obtain the line bounding box from the region extractor.
            # If we wanted to output word or character bounding boxes, we'd have to
            # add that information here.
            x0,y0,x1,y1 = regions.bboxMath(i)
            print "<span class='ocr_line' bbox='%d %d %d %d'>%s</span>"%(x0,y0,x1,y1,s)
        elif options.plain:
            # For plain output, just output the string itself.
            print s
        else:
            # The default output gives the cost, length, and string.
            print "%6.2f\t%3d\t%s"%(cost,len(s),s)

        # If display is on, we show the recognized line.  (Note that this slows
        # down recognition considerably because Matplotlib isn't very fast.)
        if options.display and options.displaylines:
            axis = subplot(111)
            # plotutils.draw_linerec(line,fst,rseg,lmodel,axis)
            if not options.Display: 
                raw_input("hit ENTER to continue")
            else:
                ginput(1,timeout=0.1) # here to flush the display; times out quickly

    # Close off the DIV for the page.
    if options.hocr:
        print "</div>"

if options.hocr:
    print hocr.footer()
